# download_ohd_issue.py
# OHD (Journal of Open Humanities Data) Downloader
# Automates downloading PDFs from OHD issue pages
# - Parses article titles and direct PDF links from live issue pages
# - Skips non-article entries (About, Contact, Newsletter, etc.)
# - Creates dynamic folders based on Volume/Year from <title>
# - Sanitizes filenames (Windows-safe) and logs all downloads in CSV
# - Works with full live URLs, following /articles/... links


import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---------- Helpers ----------
def sanitize_filename(name):
    """Return *name* reduced to a string that is safe to use as a file name.

    The result is intended to be valid on Windows (and therefore on other
    common platforms too):

    * the reserved characters ``\\ / * ? : " < > |`` are removed;
    * runs of whitespace and ASCII control characters (newlines and tabs are
      common in text scraped from HTML) are collapsed to a single space;
    * leading/trailing spaces and trailing dots are dropped, because Windows
      does not allow a name to end in either;
    * if nothing usable is left, ``"untitled"`` is returned so the caller
      never ends up with an empty file name.
    """
    # Remove the reserved punctuation first; this can leave doubled or
    # trailing spaces behind, which the next steps clean up.
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
    # Fold whitespace and control characters into single spaces.
    cleaned = re.sub(r'[\s\x00-\x1f]+', " ", cleaned)
    cleaned = cleaned.strip(" ").rstrip(". ")
    return cleaned or "untitled"

def extract_metadata_from_title(title_text):
    """Build the output folder name from an issue page's ``<title>`` text.

    A title such as ``"Volume 6 - 2020 | Journal of Open Humanities Data"``
    becomes ``"OHD_Vol6_Issue1_2020"``.

    The volume is the number following the word "Volume" (matched
    case-insensitively); the year is the first run of four digits found
    anywhere in the text. When either piece cannot be found, the literal
    placeholder ``"Vol"`` or ``"Year"`` is used in its place. The issue
    number is always written as 1.
    """
    # Start from the placeholders and overwrite whichever parts are found.
    volume = "Vol"
    year = "Year"

    volume_hit = re.search(r'Volume\s+(\d+)', title_text, re.I)
    if volume_hit:
        volume = volume_hit.group(1)

    year_hit = re.search(r'(\d{4})', title_text)
    if year_hit:
        year = year_hit.group(1)

    return f"OHD_Vol{volume}_Issue1_{year}"

# ---------- Input ----------
issue_url = input("Enter OHD issue URL: ").strip()

# ---------- Settings ----------
# Seconds to wait for a server to connect/respond on any single request.
# requests applies no timeout by default, so without this one stalled
# connection would hang the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Link texts (lower-cased) that are site navigation rather than articles.
NON_ARTICLE_TITLES = frozenset({
    "about", "contact", "content", "research integrity", "events", "newsletter",
})

print(f"[INFO] Fetching: {issue_url}")
resp = requests.get(issue_url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# ---------- Folder Name ----------
# The issue page's <title> carries the volume and year used to name the folder.
title_tag = soup.find("title")
folder_name = extract_metadata_from_title(title_tag.text if title_tag else "")
os.makedirs(folder_name, exist_ok=True)

# ---------- Find article blocks ----------
article_blocks = soup.select("div[data-sentry-component='Title'] a[href^='/en/articles/']")
print(f"[INFO] Found {len(article_blocks)} articles")

log_path = os.path.join(folder_name, f"{folder_name}_log.csv")
count = 0
# File stems already written during this run (case-folded, since Windows file
# names are case-insensitive), so two articles whose titles sanitize to the
# same name do not silently overwrite each other.
used_names = set()

# The context manager guarantees the log is flushed and closed even if the run
# is interrupted part-way through (Ctrl+C, unexpected error).
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Title", "Article URL", "PDF URL", "Status"])

    for tag in article_blocks:
        title = tag.text.strip()
        article_path = tag.get("href", "")

        # Skip non-article or placeholder entries
        if not title or title.lower() in NON_ARTICLE_TITLES:
            print(f"[SKIP] {title}")
            csv_writer.writerow([title, "", "", "Skipped (Non-article or Empty Title)"])
            continue

        article_url = urljoin(issue_url, article_path)

        # Get article page and look for the first link ending in ".pdf".
        # The broad except is deliberate: one bad article must not abort the
        # rest of the batch; the failure is reported and logged instead.
        try:
            art_resp = requests.get(article_url, timeout=REQUEST_TIMEOUT)
            art_resp.raise_for_status()
            art_soup = BeautifulSoup(art_resp.text, "html.parser")
            pdf_tag = art_soup.select_one("a[href$='.pdf']")
        except Exception as e:
            print(f"[ERROR] {title} - Failed to fetch PDF: {e}")
            csv_writer.writerow([title, article_url, "", f"Error: {e}"])
            continue

        if not pdf_tag:
            print(f"[SKIP] No PDF found for: {title}")
            csv_writer.writerow([title, article_url, "", "No PDF"])
            continue
        pdf_url = urljoin(article_url, pdf_tag["href"])

        # Pick a file name that has not been used yet in this run.
        safe_title = sanitize_filename(title)
        file_stem = safe_title
        suffix = 2
        while file_stem.casefold() in used_names:
            file_stem = f"{safe_title}_{suffix}"
            suffix += 1
        pdf_path = os.path.join(folder_name, f"{file_stem}.pdf")

        # Download PDF (same per-article best-effort policy as above).
        try:
            print(f"[{count+1}] Downloading: {file_stem}")
            r = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            with open(pdf_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            print(f"[ERROR] Failed: {title} - {e}")
            csv_writer.writerow([title, article_url, pdf_url, f"Error: {e}"])
        else:
            # Reserve the name only once the file has actually been written.
            used_names.add(file_stem.casefold())
            csv_writer.writerow([title, article_url, pdf_url, "OK"])
            count += 1

print(f"\nDone! {count} PDFs saved in {folder_name}")
print(f"Log file created: {log_path}")
